Vignette for Seurat: https://satijalab.org/seurat/articles/pbmc3k_tutorial.html
PBMC_Multiome_10xTn5_CellRangerARC/raw_feature_bc_matrix PBMC_Multiome_Tn5hc_50perc_CellRangerARC/raw_feature_bc_matrix PBMC_Multiome_Tn5hc_CellRangerARC/raw_feature_bc_matrix 20230912_PBMC_Multiome_scRNA_UMAP.ipynb
### Libraries
library(qs)
library(dplyr)
library(Seurat)
library(patchwork)
library(ggpubr)
Sys.setenv(RETICULATE_PYTHON = "miniconda3/envs/Scrublet_py3/bin/python3")
library(reticulate)
library(DoubletFinder)
##### Settings for regression during scaling of data
## BLAS Control threads:
library(RhpcBLASctl)
blas_set_num_threads(1)
omp_set_num_threads(1)
#optional
library(future)
plan("multicore",workers=9)
options(future.globals.maxSize= 5000*1024^2)
### Samples
samples <- c("PBMC_scATAC", "PBMC_scTurboATAC_16", "PBMC_scTurboATAC_13")
### Sample palette
sample_palette <- c("grey", "blue", "darkblue")
names(sample_palette) <- samples
### Data
inputFiles <- c("PBMC_Multiome_10xTn5_CellRangerARC/raw_feature_bc_matrix",
"PBMC_Multiome_Tn5hc_50perc_CellRangerARC/raw_feature_bc_matrix",
"PBMC_Multiome_Tn5hc_CellRangerARC/raw_feature_bc_matrix")
names(inputFiles) <- samples
input_data <- list()
for (sample in samples){
input_data[[sample]] <- Read10X(data.dir = inputFiles[sample])
}
Default filtering criteria for cells:
proj_list <- list()
for (sample in samples){
proj_list[[sample]] <- CreateSeuratObject(counts = input_data[[sample]][["Gene Expression"]], project = sample, min.cells = 3, min.features = 100)
proj_list[[sample]]$orig.ident <- sample
}
for (sample in samples){
proj_list[[sample]][["percent.mt"]] <- PercentageFeatureSet(proj_list[[sample]], pattern = "^MT-")
}
p1 <- ggplot(lapply(proj_list, function(x){data.frame(Sample = factor(x$orig.ident, levels = samples),
y = x$nFeature_RNA)}) %>% do.call(rbind, .),
aes(x = Sample, y = y)) +
geom_violin(aes(fill = Sample), alpha = 0.8) + scale_fill_manual(values = sample_palette) +
geom_boxplot(alpha = 0.001) +
ggtitle("Number of features") + xlab("") + ylab("nFeature_RNA") +
theme_minimal() + theme(legend.position = "none", plot.title = element_text(size=14),
axis.title = element_text(size=12), axis.text = element_text(size=12),
axis.text.x=element_text(angle=45, vjust=1, hjust=1),
legend.title = element_text(size=12), legend.text = element_text(size=12))
p2 <- ggplot(lapply(proj_list, function(x){data.frame(Sample = factor(x$orig.ident, levels = samples),
y = log10(x$nCount_RNA))}) %>% do.call(rbind, .),
aes(x = Sample, y = y)) +
geom_violin(aes(fill = Sample), alpha = 0.8) + scale_fill_manual(values = sample_palette) +
geom_boxplot(alpha = 0.001) + ylim(2, NA) +
ggtitle("Number of UMI counts") + xlab("") + ylab("log10nCount_RNA") +
theme_minimal() + theme(legend.position = "none", plot.title = element_text(size=14),
axis.title = element_text(size=12), axis.text = element_text(size=12),
axis.text.x=element_text(angle=45, vjust=1, hjust=1),
legend.title = element_text(size=12), legend.text = element_text(size=12))
p3 <- ggplot(lapply(proj_list, function(x){data.frame(Sample = factor(x$orig.ident, levels = samples),
y = x$percent.mt)}) %>% do.call(rbind, .),
aes(x = Sample, y = y)) +
geom_violin(aes(fill = Sample), alpha = 0.8) + scale_fill_manual(values = sample_palette) +
geom_boxplot(alpha = 0.001) +
ggtitle("Percent of mitochondrial reads") + xlab("") + ylab("percent_mt") +
theme_minimal() + theme(legend.position = "none", plot.title = element_text(size=14),
axis.title = element_text(size=12), axis.text = element_text(size=12),
axis.text.x=element_text(angle=45, vjust=1, hjust=1),
legend.title = element_text(size=12), legend.text = element_text(size=12))
options(repr.plot.width=12, repr.plot.height=6)
ggarrange(p1, p2, p3, ncol = 3, nrow = 1)
plot_list <- list()
for (sample in samples){
plot_list[[paste0(sample, "_nCount_percentMT")]] <- FeatureScatter(proj_list[[sample]], feature1 = "nCount_RNA", feature2 = "percent.mt",
cols = sample_palette, group.by = "orig.ident") +
theme(legend.position="none") + scale_x_log10(labels = scales::comma) +
ggtitle(paste(sample, " (", length(proj_list[[sample]]$orig.ident), "cells)")) +
ylim(c(0,80))
plot_list[[paste0(sample, "_nCount_nFeature")]] <- FeatureScatter(proj_list[[sample]], feature1 = "nCount_RNA", feature2 = "nFeature_RNA",
cols = sample_palette, group.by = "orig.ident") +
theme(legend.position="none") + ggtitle(paste(sample, " (", length(proj_list[[sample]]$orig.ident), "cells)")) +
ylim(c(0,10000))
}
options(repr.plot.width=12, repr.plot.height=5*length(samples))
ggarrange(plotlist=plot_list, ncol=2, nrow=length(samples))
summary <- lapply(samples,
function(sample){c(sample, '01_raw',
nrow(proj_list[[sample]]@meta.data),
median(proj_list[[sample]]$nFeature_RNA) %>% round(., 0),
median(proj_list[[sample]]$nCount_RNA) %>% round(., 0),
median(proj_list[[sample]]$percent.mt) %>% round(., 0))}) %>% do.call(rbind, .)
colnames(summary) <- c('sample', 'step', 'nCells', 'nFeature', 'nCount', 'percent.mt')
summary
Adapted filtering criteria for cells:
for (sample in samples){
proj_list[[sample]] <- subset(proj_list[[sample]], subset = percent.mt < 40)
}
p1 <- ggplot(lapply(proj_list, function(x){data.frame(Sample = factor(x$orig.ident, levels = samples),
y = x$nFeature_RNA)}) %>% do.call(rbind, .),
aes(x = Sample, y = y)) +
geom_violin(aes(fill = Sample), alpha = 0.8) + scale_fill_manual(values = sample_palette) +
geom_boxplot(alpha = 0.001) +
ggtitle("Number of features") + xlab("") + ylab("nFeature_RNA") +
theme_minimal() + theme(legend.position = "none", plot.title = element_text(size=14),
axis.title = element_text(size=12), axis.text = element_text(size=12),
axis.text.x=element_text(angle=45, vjust=1, hjust=1),
legend.title = element_text(size=12), legend.text = element_text(size=12))
p2 <- ggplot(lapply(proj_list, function(x){data.frame(Sample = factor(x$orig.ident, levels = samples),
y = log10(x$nCount_RNA))}) %>% do.call(rbind, .),
aes(x = Sample, y = y)) +
geom_violin(aes(fill = Sample), alpha = 0.8) + scale_fill_manual(values = sample_palette) +
geom_boxplot(alpha = 0.001) + ylim(2, NA) +
ggtitle("Number of UMI counts") + xlab("") + ylab("log10nCount_RNA") +
theme_minimal() + theme(legend.position = "none", plot.title = element_text(size=14),
axis.title = element_text(size=12), axis.text = element_text(size=12),
axis.text.x=element_text(angle=45, vjust=1, hjust=1),
legend.title = element_text(size=12), legend.text = element_text(size=12))
p3 <- ggplot(lapply(proj_list, function(x){data.frame(Sample = factor(x$orig.ident, levels = samples),
y = x$percent.mt)}) %>% do.call(rbind, .),
aes(x = Sample, y = y)) +
geom_violin(aes(fill = Sample), alpha = 0.8) + scale_fill_manual(values = sample_palette) +
geom_boxplot(alpha = 0.001) +
ggtitle("Percent of mitochondrial reads") + xlab("") + ylab("percent_mt") +
geom_hline(yintercept = c(40)) +
theme_minimal() + theme(legend.position = "none", plot.title = element_text(size=14),
axis.title = element_text(size=12), axis.text = element_text(size=12),
axis.text.x=element_text(angle=45, vjust=1, hjust=1),
legend.title = element_text(size=12), legend.text = element_text(size=12))
options(repr.plot.width=12, repr.plot.height=6)
ggarrange(p1, p2, p3, ncol = 3, nrow = 1)
plot_list <- list()
for (sample in samples){
plot_list[[paste0(sample, "_nCount_percentMT")]] <- FeatureScatter(proj_list[[sample]], feature1 = "nCount_RNA", feature2 = "percent.mt",
cols = sample_palette, group.by = "orig.ident") +
theme(legend.position="none") + scale_x_log10(labels = scales::comma) +
ggtitle(paste(sample, " (", length(proj_list[[sample]]$orig.ident), "cells)")) +
ylim(c(0,80)) + geom_hline(yintercept = c(40))
plot_list[[paste0(sample, "_nCount_nFeature")]] <- FeatureScatter(proj_list[[sample]], feature1 = "nCount_RNA", feature2 = "nFeature_RNA",
cols = sample_palette, group.by = "orig.ident") +
theme(legend.position="none") + ggtitle(paste(sample, " (", length(proj_list[[sample]]$orig.ident), "cells)")) +
ylim(c(0,10000))
}
options(repr.plot.width=12, repr.plot.height=5*length(samples))
ggarrange(plotlist=plot_list, ncol=2, nrow=length(samples))
summary <- lapply(samples,
function(sample){c(sample, '02A_percent.mt',
nrow(proj_list[[sample]]@meta.data),
median(proj_list[[sample]]$nFeature_RNA) %>% round(., 0),
median(proj_list[[sample]]$nCount_RNA) %>% round(., 0),
median(proj_list[[sample]]$percent.mt) %>% round(., 0))}) %>% do.call(rbind, .) %>% rbind(summary, .)
colnames(summary) <- c('sample', 'step', 'nCells', 'nFeature', 'nCount', 'percent.mt')
summary
Adapted filtering criteria for cells:
for (sample in samples){
proj_list[[sample]] <- subset(proj_list[[sample]], subset = nCount_RNA > 500)
}
p1 <- ggplot(lapply(proj_list, function(x){data.frame(Sample = factor(x$orig.ident, levels = samples),
y = x$nFeature_RNA)}) %>% do.call(rbind, .),
aes(x = Sample, y = y)) +
geom_violin(aes(fill = Sample), alpha = 0.8) + scale_fill_manual(values = sample_palette) +
geom_boxplot(alpha = 0.001) +
ggtitle("Number of features") + xlab("") + ylab("nFeature_RNA") +
theme_minimal() + theme(legend.position = "none", plot.title = element_text(size=14),
axis.title = element_text(size=12), axis.text = element_text(size=12),
axis.text.x=element_text(angle=45, vjust=1, hjust=1),
legend.title = element_text(size=12), legend.text = element_text(size=12))
p2 <- ggplot(lapply(proj_list, function(x){data.frame(Sample = factor(x$orig.ident, levels = samples),
y = log10(x$nCount_RNA))}) %>% do.call(rbind, .),
aes(x = Sample, y = y)) +
geom_violin(aes(fill = Sample), alpha = 0.8) + scale_fill_manual(values = sample_palette) +
geom_boxplot(alpha = 0.001) +
ggtitle("Number of UMI counts") + xlab("") + ylab("log10nCount_RNA") +
geom_hline(yintercept = c(log10(500))) + ylim(2, NA) +
theme_minimal() + theme(legend.position = "none", plot.title = element_text(size=14),
axis.title = element_text(size=12), axis.text = element_text(size=12),
axis.text.x=element_text(angle=45, vjust=1, hjust=1),
legend.title = element_text(size=12), legend.text = element_text(size=12))
p3 <- ggplot(lapply(proj_list, function(x){data.frame(Sample = factor(x$orig.ident, levels = samples),
y = x$percent.mt)}) %>% do.call(rbind, .),
aes(x = Sample, y = y)) +
geom_violin(aes(fill = Sample), alpha = 0.8) + scale_fill_manual(values = sample_palette) +
geom_boxplot(alpha = 0.001) +
ggtitle("Percent of mitochondrial reads") + xlab("") + ylab("percent_mt") +
geom_hline(yintercept = c(40)) +
theme_minimal() + theme(legend.position = "none", plot.title = element_text(size=14),
axis.title = element_text(size=12), axis.text = element_text(size=12),
axis.text.x=element_text(angle=45, vjust=1, hjust=1),
legend.title = element_text(size=12), legend.text = element_text(size=12))
options(repr.plot.width=12, repr.plot.height=6)
ggarrange(p1, p2, p3, ncol = 3, nrow = 1)
plot_list <- list()
for (sample in samples){
plot1 <- lapply(unique(proj_list[[sample]]$orig.ident),
function(x){FeatureScatter(proj_list[[sample]], feature1 = "nCount_RNA", feature2 = "percent.mt",
cols = sample_palette, group.by = "orig.ident") +
theme(legend.position="none") + scale_x_log10() +
ggtitle(paste(x, " (", length(proj_list[[sample]]$orig.ident), "cells)")) +
ylim(c(0,80)) + geom_hline(yintercept = c(40)) + geom_vline(xintercept = c(500)) }) %>%
ggarrange(plotlist=., ncol=1, nrow=1)
plot2 <- lapply(unique(proj_list[[sample]]$orig.ident),
function(x){FeatureScatter(proj_list[[sample]], feature1 = "nCount_RNA", feature2 = "nFeature_RNA",
cols = sample_palette, group.by = "orig.ident") +
theme(legend.position="none") + ggtitle(paste(x, " (", length(proj_list[[sample]]$orig.ident), "cells)")) +
ylim(c(0,10000)) + geom_vline(xintercept = c(500)) }) %>%
ggarrange(plotlist=., ncol=1, nrow=1)
plot_list[[sample]] <- ggarrange(plot1, plot2, ncol=1, nrow=2)
}
options(repr.plot.width=20, repr.plot.height=14)
ggarrange(plotlist=plot_list, ncol=3, nrow=1)
summary <- lapply(samples,
function(sample){c(sample, '02B_percent.mt_nCount',
nrow(proj_list[[sample]]@meta.data),
median(proj_list[[sample]]$nFeature_RNA) %>% round(., 0),
median(proj_list[[sample]]$nCount_RNA) %>% round(., 0),
median(proj_list[[sample]]$percent.mt) %>% round(., 0))}) %>% do.call(rbind, .) %>% rbind(summary, .)
colnames(summary) <- c('sample', 'step', 'nCells', 'nFeature', 'nCount', 'percent.mt')
summary
plot_list <- list()
for (sample in samples){
proj_list[[sample]] <- NormalizeData(proj_list[[sample]], normalization.method = "LogNormalize", scale.factor = 10000)
proj_list[[sample]] <- ScaleData(proj_list[[sample]], features = rownames(proj_list[[sample]]))
proj_list[[sample]] <- FindVariableFeatures(proj_list[[sample]], selection.method = "vst", nfeatures = 2000)
proj_list[[sample]] <- RunPCA(proj_list[[sample]], features = VariableFeatures(proj_list[[sample]]))
plot_list[[sample]] <- ElbowPlot(proj_list[[sample]]) + ggtitle(sample)
}
options(repr.plot.width=5*length(samples), repr.plot.height=5)
ggpubr::ggarrange(plotlist = plot_list, ncol=length(samples))
PC_vector <- c(20, 20, 20)
names(PC_vector) <- samples
for (sample in samples){
proj_list[[sample]] <- FindNeighbors(proj_list[[sample]], dims = 1:PC_vector[sample])
proj_list[[sample]] <- RunUMAP(proj_list[[sample]], dims = 1:PC_vector[sample])
}
proj_list <- lapply(samples, function(sample){mat <- GetAssayData(object = proj_list[[sample]], assay = "RNA", slot = "counts");
reticulate::use_condaenv("/home/bq_ilander/miniconda3/envs/Scrublet_py3/bin/python3");
reticulate::import("os")
Matrix::writeMM(mat, file="count_matrix")
source_python("/media/ag-rippe2/isabelle/_PythonScripts/Scrublet.py")
x <- as.numeric(as.data.frame(data.table::fread("count_matrix.doubletScores", sep = ",", header = F, skip = 1))[, 2])
names(x) <- colnames(mat)
file.remove("count_matrix")
file.remove("count_matrix.doubletScores")
proj_list[[sample]] <- AddMetaData(proj_list[[sample]], x, "Scrublet_Score")
return(proj_list[[sample]])
})
names(proj_list) <- samples
plot_list <- list()
options(repr.plot.width=5*length(samples), repr.plot.height=6)
par(mfrow=c(1,length(samples)))
for (sample in samples){
hist(proj_list[[sample]]$Scrublet_Score, breaks=50, main = sample)
abline(v = c(0.15), col="red")
}
for (sample in samples){
print(sample)
table(proj_list[[sample]]$Scrublet_Score >= 0.15) %>% print(.)
proj_list[[sample]]$Doublets <- proj_list[[sample]]$Scrublet_Score >= 0.15
}
plot_list <- list()
for (sample in samples){
plot_list[["Scrublet_Score"]][[sample]] <- FeaturePlot(proj_list[[sample]], "Scrublet_Score") + ggtitle(paste(sample, "- Scrublet Scores"))
plot_list[["Doublets"]][[sample]] <- FeaturePlot(proj_list[[sample]], "Doublets") + ggtitle(paste(sample, "- Doublets (cutoff 0.15)"))
plot_list[["Scatter_nCount"]][[sample]] <- FeatureScatter(proj_list[[sample]], feature1 = "nCount_RNA", feature2 = "Scrublet_Score", group.by = "Doublets") + ggtitle(paste(sample, "- Doublets (0.15)"))
plot_list[["Scatter_nFeature"]][[sample]] <- FeatureScatter(proj_list[[sample]], feature1 = "nFeature_RNA", feature2 = "Scrublet_Score", group.by = "Doublets") + ggtitle(paste(sample, "- Doublets (0.15)"))
plot_list[["Scatter_percent.mt"]][[sample]] <- FeatureScatter(proj_list[[sample]], feature1 = "percent.mt", feature2 = "Scrublet_Score", group.by = "Doublets") + ggtitle(paste(sample, "- Doublets (0.15)"))
}
options(repr.plot.width=6*length(samples), repr.plot.height=5)
ggarrange(plotlist = plot_list[["Scrublet_Score"]], ncol=length(samples))
ggarrange(plotlist = plot_list[["Doublets"]], ncol=length(samples))
ggarrange(plotlist = plot_list[["Scatter_nCount"]], ncol=length(samples))
ggarrange(plotlist = plot_list[["Scatter_nFeature"]], ncol=length(samples))
ggarrange(plotlist = plot_list[["Scatter_percent.mt"]], ncol=length(samples))
DF_params <- list()
for (sample in samples){
print(sample)
y <- paramSweep_v3(proj_list[[sample]], PCs = 1:PC_vector[sample], sct = FALSE, num.cores = 10)
y <- summarizeSweep(y, GT = FALSE)
DF_params[[sample]] <- find.pK(y)
}
# Optimal pK for any scRNA-seq data can be manually discerned as maxima in BCmvn distributions.
pK_vector <- character()
for (sample in samples){
pK_vector[sample] <- as.character(DF_params[[sample]]$pK[DF_params[[sample]]$BCmetric == max(DF_params[[sample]]$BCmetric)])
}
pK_vector <- as.numeric(pK_vector)
names(pK_vector) <- samples
for (sample in samples){
proj_list[[sample]] <- doubletFinder_v3(proj_list[[sample]], PCs = 1:PC_vector[sample], pK = pK_vector[sample],
nExp = round(0.046*nrow(proj_list[[sample]]@meta.data)), reuse.pANN = FALSE, sct = FALSE)
}
plot_list <- list()
for (sample in samples){
plot_list[["DoubletFinder"]][[sample]] <- DimPlot(proj_list[[sample]],
group.by = grep("DF.classifications", colnames(proj_list[[sample]]@meta.data), value = TRUE),
cols = c("red", "grey")) +
ggtitle(paste(sample, "- DoubletFinder"))
}
for (sample in samples){
print(sample)
print(table(proj_list[[sample]]@meta.data[, grep("DF.classifications", colnames(proj_list[[sample]]@meta.data), value = TRUE)]))
}
options(repr.plot.width=6*length(samples), repr.plot.height=5)
ggpubr::ggarrange(plotlist = plot_list[["DoubletFinder"]], ncol=length(samples))
for (sample in samples){
proj_list[[sample]] <- subset(proj_list[[sample]], subset = Doublets == FALSE)
}
p1 <- ggplot(lapply(proj_list, function(x){data.frame(Sample = factor(x$orig.ident, levels = samples),
y = x$nFeature_RNA)}) %>% do.call(rbind, .),
aes(x = Sample, y = y)) +
geom_violin(aes(fill = Sample), alpha = 0.8) + scale_fill_manual(values = sample_palette) +
geom_boxplot(alpha = 0.001) +
ggtitle("Number of features") + xlab("") + ylab("nFeature_RNA") +
theme_minimal() + theme(legend.position = "none", plot.title = element_text(size=14),
axis.title = element_text(size=12), axis.text = element_text(size=12),
axis.text.x=element_text(angle=45, vjust=1, hjust=1),
legend.title = element_text(size=12), legend.text = element_text(size=12))
p2 <- ggplot(lapply(proj_list, function(x){data.frame(Sample = factor(x$orig.ident, levels = samples),
y = log10(x$nCount_RNA))}) %>% do.call(rbind, .),
aes(x = Sample, y = y)) +
geom_violin(aes(fill = Sample), alpha = 0.8) + scale_fill_manual(values = sample_palette) +
geom_boxplot(alpha = 0.001) +
ggtitle("Number of UMI counts") + xlab("") + ylab("log10nCount_RNA") +
geom_hline(yintercept = c(log10(500))) + ylim(2, NA) +
theme_minimal() + theme(legend.position = "none", plot.title = element_text(size=14),
axis.title = element_text(size=12), axis.text = element_text(size=12),
axis.text.x=element_text(angle=45, vjust=1, hjust=1),
legend.title = element_text(size=12), legend.text = element_text(size=12))
p3 <- ggplot(lapply(proj_list, function(x){data.frame(Sample = factor(x$orig.ident, levels = samples),
y = x$percent.mt)}) %>% do.call(rbind, .),
aes(x = Sample, y = y)) +
geom_violin(aes(fill = Sample), alpha = 0.8) + scale_fill_manual(values = sample_palette) +
geom_boxplot(alpha = 0.001) +
ggtitle("Percent of mitochondrial reads") + xlab("") + ylab("percent_mt") +
geom_hline(yintercept = c(40)) +
theme_minimal() + theme(legend.position = "none", plot.title = element_text(size=14),
axis.title = element_text(size=12), axis.text = element_text(size=12),
axis.text.x=element_text(angle=45, vjust=1, hjust=1),
legend.title = element_text(size=12), legend.text = element_text(size=12))
options(repr.plot.width=12, repr.plot.height=6)
ggarrange(p1, p2, p3, ncol = 3, nrow = 1)
plot_list <- list()
for (sample in samples){
plot1 <- lapply(unique(proj_list[[sample]]$orig.ident),
function(x){FeatureScatter(proj_list[[sample]], feature1 = "nCount_RNA", feature2 = "percent.mt",
cols = sample_palette, group.by = "orig.ident") +
theme(legend.position="none") + scale_x_log10() +
ggtitle(paste(x, " (", length(proj_list[[sample]]$orig.ident), "cells)")) +
ylim(c(0,80)) + geom_hline(yintercept = c(40)) + geom_vline(xintercept = c(500)) }) %>%
ggarrange(plotlist=., ncol=1, nrow=1)
plot2 <- lapply(unique(proj_list[[sample]]$orig.ident),
function(x){FeatureScatter(proj_list[[sample]], feature1 = "nCount_RNA", feature2 = "nFeature_RNA",
cols = sample_palette, group.by = "orig.ident") +
theme(legend.position="none") + ggtitle(paste(x, " (", length(proj_list[[sample]]$orig.ident), "cells)")) +
ylim(c(0,10000)) + geom_vline(xintercept = c(500)) }) %>%
ggarrange(plotlist=., ncol=1, nrow=1)
plot_list[[sample]] <- ggarrange(plot1, plot2, ncol=1, nrow=2)
}
options(repr.plot.width=20, repr.plot.height=14)
ggarrange(plotlist=plot_list, ncol=3, nrow=1)
summary <- lapply(samples,
function(sample){c(sample, '03_percent.mt_nCount_doublets',
nrow(proj_list[[sample]]@meta.data),
median(proj_list[[sample]]$nFeature_RNA) %>% round(., 0),
median(proj_list[[sample]]$nCount_RNA) %>% round(., 0),
median(proj_list[[sample]]$percent.mt) %>% round(., 0))}) %>% do.call(rbind, .) %>% rbind(summary, .)
colnames(summary) <- c('sample', 'step', 'nCells', 'nFeature', 'nCount', 'percent.mt')
summary
for (sample in samples){
proj_list[[sample]] <- NormalizeData(proj_list[[sample]], normalization.method = "LogNormalize", scale.factor = 10000)
proj_list[[sample]] <- ScaleData(proj_list[[sample]], features = rownames(proj_list[[sample]]))
}
for (sample in samples){
proj_list[[sample]] <- SCTransform(proj_list[[sample]])
}
plot_list <- list()
for (sample in samples){
proj_list[[sample]] <- FindVariableFeatures(proj_list[[sample]], assay = "RNA", selection.method = "vst", nfeatures = 2000)
proj_list[[sample]] <- FindVariableFeatures(proj_list[[sample]], assay = "SCT", selection.method = "vst", nfeatures = 2000)
# Identify the 10 most highly variable genes
top10_RNA <- head(VariableFeatures(proj_list[[sample]], assay = "RNA"), 10)
top10_SCT <- head(VariableFeatures(proj_list[[sample]], assay = "SCT"), 10)
# plot variable features with and without labels
plot_list[[paste0(sample, "_RNA")]] <- VariableFeaturePlot(proj_list[[sample]], assay = "RNA") %>%
LabelPoints(plot = ., points = top10_RNA, repel = TRUE)
plot_list[[paste0(sample, "_SCT")]] <- VariableFeaturePlot(proj_list[[sample]], assay = "SCT") %>%
LabelPoints(plot = ., points = top10_SCT, repel = TRUE)
}
options(repr.plot.width=20, repr.plot.height=6*length(samples))
ggarrange(plotlist = plot_list, ncol = 2, nrow = length(samples))
plot_list <- list()
for (sample in samples){
proj_list[[sample]] <- RunPCA(proj_list[[sample]], assay = "RNA", reduction.name = "pca_RNA",
features = VariableFeatures(proj_list[[sample]], assay = "RNA"))
proj_list[[sample]] <- RunPCA(proj_list[[sample]], assay = "SCT", reduction.name = "pca_SCT",
features = VariableFeatures(proj_list[[sample]], assay = "SCT"))
plot_list[[paste0(sample, "_RNA")]] <- ElbowPlot(proj_list[[sample]], reduction = "pca_RNA", ndims = 50) + ggtitle(sample)
plot_list[[paste0(sample, "_SCT")]] <- ElbowPlot(proj_list[[sample]], reduction = "pca_SCT", ndims = 50) + ggtitle(sample)
}
options(repr.plot.width=12, repr.plot.height=5*length(samples))
ggarrange(plotlist = plot_list, ncol = 2, nrow = length(samples))
PCs_list <- rep(20, 6)
names(PCs_list) <- paste0(rep(samples, each=2), c("_RNA", "_SCT"))
for (sample in samples){
proj_list[[sample]] <- FindNeighbors(proj_list[[sample]], assay = "RNA", reduction = "pca_RNA",
dims = 1:PCs_list[[paste0(sample, "_RNA")]], graph.name = "RNA_snn")
proj_list[[sample]] <- FindNeighbors(proj_list[[sample]], assay = "SCT", reduction = "pca_SCT",
dims = 1:PCs_list[[paste0(sample, "_SCT")]], graph.name = "SCT_snn")
for (resolution in c(0.2, 0.5, 0.8)){
proj_list[[sample]] <- FindClusters(proj_list[[sample]], resolution = resolution, graph.name = "RNA_snn")
proj_list[[sample]] <- FindClusters(proj_list[[sample]], resolution = resolution, graph.name = "SCT_snn")
}
}
for (sample in samples){
proj_list[[sample]] <- RunUMAP(proj_list[[sample]], assay = "RNA", reduction = "pca_RNA",
dims = 1:PCs_list[[paste0(sample, "_RNA")]], reduction.name = "umap_RNA")
proj_list[[sample]] <- RunUMAP(proj_list[[sample]], assay = "SCT", reduction = "pca_SCT",
dims = 1:PCs_list[[paste0(sample, "_SCT")]], reduction.name = "umap_SCT")
}
plot_list <- list()
for (sample in samples){
plot_list[[sample]] <- list()
plot_list[[sample]]$clusters0.2 <- DimPlot(proj_list[[sample]], reduction = "umap_RNA", group.by = "RNA_snn_res.0.2")
plot_list[[sample]]$clusters0.5 <- DimPlot(proj_list[[sample]], reduction = "umap_RNA", group.by = "RNA_snn_res.0.5")
plot_list[[sample]]$clusters0.8 <- DimPlot(proj_list[[sample]], reduction = "umap_RNA", group.by = "RNA_snn_res.0.8")
plot_list[[sample]]$nCount <- FeaturePlot(proj_list[[sample]], reduction = "umap_RNA", features = "nCount_RNA")
plot_list[[sample]]$nFeature <- FeaturePlot(proj_list[[sample]], reduction = "umap_RNA", features = "nFeature_RNA")
plot_list[[sample]]$percentMT <- FeaturePlot(proj_list[[sample]], reduction = "umap_RNA", features = "percent.mt")
}
options(repr.plot.width=20, repr.plot.height=12)
for (sample in samples){
ggarrange(plotlist = plot_list[[sample]], ncol = 3, nrow = 2, labels = sample) %>% print(.)
}
plot_list <- list()
for (sample in samples){
plot_list[[sample]] <- list()
plot_list[[sample]]$clusters0.2 <- DimPlot(proj_list[[sample]], reduction = "umap_SCT", group.by = "SCT_snn_res.0.2")
plot_list[[sample]]$clusters0.5 <- DimPlot(proj_list[[sample]], reduction = "umap_SCT", group.by = "SCT_snn_res.0.5")
plot_list[[sample]]$clusters0.8 <- DimPlot(proj_list[[sample]], reduction = "umap_SCT", group.by = "SCT_snn_res.0.8")
plot_list[[sample]]$nCount <- FeaturePlot(proj_list[[sample]], reduction = "umap_SCT", features = "nCount_RNA")
plot_list[[sample]]$nFeature <- FeaturePlot(proj_list[[sample]], reduction = "umap_SCT", features = "nFeature_RNA")
plot_list[[sample]]$percentMT <- FeaturePlot(proj_list[[sample]], reduction = "umap_SCT", features = "percent.mt")
}
options(repr.plot.width=20, repr.plot.height=12)
for (sample in samples){
ggarrange(plotlist = plot_list[[sample]], ncol = 3, nrow = 2, labels = sample) %>% print(.)
}
# A list of cell cycle markers, from Tirosh et al, 2015, is loaded with Seurat. We can
# segregate this list into markers of G2/M phase and markers of S phase
# Filter out genes that are not in Seurat object
s.genes_list <- list()
g2m.genes_list <- list()
for (sample in samples){
s.genes_list[[paste0(sample, "_RNA")]] <- cc.genes$s.genes[cc.genes$s.genes %in% rownames(proj_list[[sample]]@assays$RNA@meta.features)]
g2m.genes_list[[paste0(sample, "_RNA")]] <- cc.genes$g2m.genes[cc.genes$g2m.genes %in% rownames(proj_list[[sample]]@assays$RNA@meta.features)]
s.genes_list[[paste0(sample, "_SCT")]] <- cc.genes$s.genes[cc.genes$s.genes %in% rownames(proj_list[[sample]]@assays$SCT@meta.features)]
g2m.genes_list[[paste0(sample, "_SCT")]] <- cc.genes$g2m.genes[cc.genes$g2m.genes %in% rownames(proj_list[[sample]]@assays$SCT@meta.features)]
}
for (sample in samples){
proj_list[[sample]] <- CellCycleScoring(proj_list[[sample]], assay = "RNA",
s.features = s.genes_list[[paste0(sample, "_RNA")]],
g2m.features = g2m.genes_list[[paste0(sample, "_RNA")]], set.ident = TRUE)
colnames(proj_list[[sample]]@meta.data)[grep("^S.Score$|^G2M.Score$|^Phase$",
colnames(proj_list[[sample]]@meta.data))] <- paste0(c("S.Score", "G2M.Score", "Phase"),
"_RNA")
proj_list[[sample]] <- CellCycleScoring(proj_list[[sample]], assay = "SCT",
s.features = s.genes_list[[paste0(sample, "_SCT")]],
g2m.features = g2m.genes_list[[paste0(sample, "_SCT")]], set.ident = TRUE)
colnames(proj_list[[sample]]@meta.data)[grep("^S.Score$|^G2M.Score$|^Phase$",
colnames(proj_list[[sample]]@meta.data))] <- paste0(c("S.Score", "G2M.Score", "Phase"),
"_SCT")
proj_list[[sample]] <- RunPCA(proj_list[[sample]], assay = "RNA",
features = c(s.genes_list[[paste0(sample, "_RNA")]], g2m.genes_list[[paste0(sample, "_RNA")]]),
reduction.name = "pca_RNA_s.g2m.genes")
proj_list[[sample]] <- RunPCA(proj_list[[sample]], assay = "SCT",
features = c(s.genes_list[[paste0(sample, "_SCT")]], g2m.genes_list[[paste0(sample, "_SCT")]]),
reduction.name = "pca_SCT_s.g2m.genes")
}
for (sample in samples){
print(sample)
print("RNA")
table(proj_list[[sample]]$Phase_RNA) %>% print(.)
print("SCT")
table(proj_list[[sample]]$Phase_SCT) %>% print(.)
}
plot_list <- list()
for (sample in samples){
p1 <- DimPlot(proj_list[[sample]], reduction = "pca_RNA_s.g2m.genes") + ggtitle(sample)
p2 <- DimPlot(proj_list[[sample]], reduction = "umap_RNA", group.by ="Phase_RNA")
p3 <- DimPlot(proj_list[[sample]], reduction = "pca_SCT_s.g2m.genes")
p4 <- DimPlot(proj_list[[sample]], reduction = "umap_SCT", group.by = "Phase_SCT")
plot_list[[sample]] <- ggarrange(p1, p2, p3, p4, ncol=4)
}
options(repr.plot.width=20, repr.plot.height=5*length(samples))
ggarrange(plotlist = plot_list, ncol = 1, nrow = length(samples))
projCC_list <- list()
for (sample in samples){
projCC_list[[sample]] <- ScaleData(proj_list[[sample]], assay = "RNA",
vars.to.regress = c("S.Score_RNA", "G2M.Score_RNA"),
features = rownames(proj_list[[sample]]))
projCC_list[[sample]] <- ScaleData(projCC_list[[sample]], assay = "SCT",
vars.to.regress = c("S.Score_SCT", "G2M.Score_SCT"),
features = rownames(projCC_list[[sample]]))
}
plot_list <- list()
for (sample in samples){
projCC_list[[sample]] <- RunPCA(projCC_list[[sample]], assay = "RNA",
features = c(s.genes_list[[paste0(sample, "_RNA")]], g2m.genes_list[[paste0(sample, "_RNA")]]),
reduction.name = "pca_RNA_s.g2m.genes")
projCC_list[[sample]] <- RunPCA(projCC_list[[sample]], assay = "SCT",
features = c(s.genes_list[[paste0(sample, "_SCT")]], g2m.genes_list[[paste0(sample, "_SCT")]]),
reduction.name = "pca_SCT_s.g2m.genes")
plot_list[[paste0(sample, "_RNA")]] <- DimPlot(projCC_list[[sample]], reduction = "pca_RNA_s.g2m.genes", group.by = "Phase_RNA")
plot_list[[paste0(sample, "_SCT")]] <- DimPlot(projCC_list[[sample]], reduction = "pca_SCT_s.g2m.genes", group.by = "Phase_SCT")
}
options(repr.plot.width=10, repr.plot.height=5*length(samples))
ggarrange(plotlist = plot_list, ncol = 2, nrow = length(samples))
plot_list <- list()
for (sample in samples){
projCC_list[[sample]] <- RunPCA(projCC_list[[sample]], assay = "RNA", reduction.name = "pca_RNA",
features = VariableFeatures(projCC_list[[sample]], assay = "RNA"))
projCC_list[[sample]] <- RunPCA(projCC_list[[sample]], assay = "SCT", reduction.name = "pca_SCT",
features = VariableFeatures(projCC_list[[sample]], assay = "SCT"))
plot_list[[paste0(sample, "_RNA")]] <- ElbowPlot(projCC_list[[sample]], reduction = "pca_RNA", ndims = 50) + ggtitle(sample)
plot_list[[paste0(sample, "_SCT")]] <- ElbowPlot(projCC_list[[sample]], reduction = "pca_SCT", ndims = 50) + ggtitle(sample)
}
options(repr.plot.width=12, repr.plot.height=5*length(samples))
ggarrange(plotlist = plot_list, ncol = 2, nrow = length(samples))
PCs_list <- rep(25, 6)
names(PCs_list) <- paste0(rep(samples, each=2), c("_RNA", "_SCT"))
for (sample in samples){
projCC_list[[sample]] <- FindNeighbors(projCC_list[[sample]], assay = "RNA", reduction = "pca_RNA",
dims = 1:PCs_list[[paste0(sample, "_RNA")]], graph.name = "RNA_snn")
projCC_list[[sample]] <- FindNeighbors(projCC_list[[sample]], assay = "SCT", reduction = "pca_SCT",
dims = 1:PCs_list[[paste0(sample, "_SCT")]], graph.name = "SCT_snn")
for (resolution in c(0.2, 0.5, 0.8)){
projCC_list[[sample]] <- FindClusters(projCC_list[[sample]], resolution = resolution, graph.name = "RNA_snn")
projCC_list[[sample]] <- FindClusters(projCC_list[[sample]], resolution = resolution, graph.name = "SCT_snn")
}
}
for (sample in samples){
projCC_list[[sample]] <- RunUMAP(projCC_list[[sample]], assay = "RNA", reduction = "pca_RNA",
dims = 1:PCs_list[[paste0(sample, "_RNA")]], reduction.name = "umap_RNA")
projCC_list[[sample]] <- RunUMAP(projCC_list[[sample]], assay = "SCT", reduction = "pca_SCT",
dims = 1:PCs_list[[paste0(sample, "_SCT")]], reduction.name = "umap_SCT")
}
plot_list <- list()
for (sample in samples){
plot_list[[sample]] <- list()
plot_list[[sample]]$clusters0.2 <- DimPlot(projCC_list[[sample]], reduction = "umap_RNA", group.by = "RNA_snn_res.0.2")
plot_list[[sample]]$clusters0.5 <- DimPlot(projCC_list[[sample]], reduction = "umap_RNA", group.by = "RNA_snn_res.0.5")
plot_list[[sample]]$clusters0.8 <- DimPlot(projCC_list[[sample]], reduction = "umap_RNA", group.by = "RNA_snn_res.0.8")
plot_list[[sample]]$Cellcycle <- DimPlot(projCC_list[[sample]], reduction = "umap_RNA", group.by = "Phase_RNA")
plot_list[[sample]]$nCount <- FeaturePlot(projCC_list[[sample]], reduction = "umap_RNA", features = "nCount_RNA")
plot_list[[sample]]$nFeature <- FeaturePlot(projCC_list[[sample]], reduction = "umap_RNA", features = "nFeature_RNA")
plot_list[[sample]]$percentMT <- FeaturePlot(projCC_list[[sample]], reduction = "umap_RNA", features = "percent.mt")
}
options(repr.plot.width=25, repr.plot.height=12)
for (sample in samples){
ggarrange(plotlist = plot_list[[sample]], ncol = 4, nrow = 2, labels = sample) %>% print(.)
}
plot_list <- list()
for (sample in samples){
plot_list[[sample]] <- list()
plot_list[[sample]]$clusters0.2 <- DimPlot(projCC_list[[sample]], reduction = "umap_SCT", group.by = "SCT_snn_res.0.2")
plot_list[[sample]]$clusters0.5 <- DimPlot(projCC_list[[sample]], reduction = "umap_SCT", group.by = "SCT_snn_res.0.5")
plot_list[[sample]]$clusters0.8 <- DimPlot(projCC_list[[sample]], reduction = "umap_SCT", group.by = "SCT_snn_res.0.8")
plot_list[[sample]]$Cellcycle <- DimPlot(projCC_list[[sample]], reduction = "umap_SCT", group.by = "Phase_SCT")
plot_list[[sample]]$nCount <- FeaturePlot(projCC_list[[sample]], reduction = "umap_SCT", features = "nCount_RNA")
plot_list[[sample]]$nFeature <- FeaturePlot(projCC_list[[sample]], reduction = "umap_SCT", features = "nFeature_RNA")
plot_list[[sample]]$percentMT <- FeaturePlot(projCC_list[[sample]], reduction = "umap_SCT", features = "percent.mt")
}
options(repr.plot.width=25, repr.plot.height=12)
for (sample in samples){
ggarrange(plotlist = plot_list[[sample]], ncol = 4, nrow = 2, labels = sample) %>% print(.)
}
### Save Seurat objects
qsave(proj_list, 'SeuratObjects.qs')
qsave(projCC_list, 'SeuratObjects_CCcorrected.qs')
### Save Sample statistics
write.table(summary, 'Summary.csv', sep=',', col.names = T, row.names = F)
lapply(samples, function(sample){all(rownames(proj_list[[sample]]@meta.data) == rownames(proj_list[[sample]]@reductions$umap_RNA@cell.embeddings) &
rownames(proj_list[[sample]]@meta.data) == rownames(proj_list[[sample]]@reductions$umap_SCT@cell.embeddings) &
rownames(proj_list[[sample]]@meta.data) == rownames(projCC_list[[sample]]@meta.data) &
rownames(proj_list[[sample]]@meta.data) == rownames(projCC_list[[sample]]@reductions$umap_RNA@cell.embeddings) &
rownames(proj_list[[sample]]@meta.data) == rownames(projCC_list[[sample]]@reductions$umap_SCT@cell.embeddings)) %>% print(.);
df <- cbind(proj_list[[sample]]@meta.data,
proj_list[[sample]]@reductions$umap_RNA@cell.embeddings,
proj_list[[sample]]@reductions$umap_SCT@cell.embeddings,
projCC_list[[sample]]@reductions$umap_RNA@cell.embeddings,
projCC_list[[sample]]@reductions$umap_SCT@cell.embeddings)
colnames(df) <- c(colnames(proj_list[[sample]]@meta.data),
colnames(proj_list[[sample]]@reductions$umap_RNA@cell.embeddings),
colnames(proj_list[[sample]]@reductions$umap_SCT@cell.embeddings),
paste0(colnames(projCC_list[[sample]]@reductions$umap_RNA@cell.embeddings), "_CCcorrected"),
paste0(colnames(projCC_list[[sample]]@reductions$umap_SCT@cell.embeddings), "_CCcorrected"))
write.csv(df, paste0("RawData_", sample, ".csv"))})
Isabelle Seufert
Division of Chromatin Networks, DKFZ
12.09.2023
sessionInfo()